! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE GATH_GRID_CTL_MOD
CONTAINS
SUBROUTINE GATH_GRID_CTL(PGPG,KFGATHG,KPROMA,KTO,PGP)

!**** *GATH_GRID_CTL* - Gather global gridpoint array from processors

!     Purpose.
!     --------
!        Routine for gathering gridpoint array

!**   Interface.
!     ----------
!     CALL GATH_GRID_CTL(...)

!     Explicit arguments :
!     --------------------
!     PGPG(:,:)   - Global gridpoint array
!     KFGATHG     - Global number of fields to be gathered
!     KPROMA      - blocking factor for gridpoint input
!     KTO(:)      - Processor responsible for gathering each field
!     PGP(:,:,:)  - Local gridpoint array
!
!     ------------------------------------------------------------------


USE PARKIND1  ,ONLY : JPIM     ,JPRB
USE MPL_MODULE  ,ONLY : MPL_ALLTOALLV, MPL_RECV, MPL_SEND, MPL_WAIT,   &
     &                  JP_BLOCKING_STANDARD, JP_NON_BLOCKING_STANDARD

USE TPM_GEOMETRY    ,ONLY : G
USE TPM_DISTR       ,ONLY : D, MTAGDISTSP, NPRCIDS, MYPROC, NPROC

USE SET2PE_MOD      ,ONLY : SET2PE
USE EQ_REGIONS_MOD  ,ONLY : N_REGIONS, N_REGIONS_NS

IMPLICIT NONE

! Declaration of arguments

REAL(KIND=JPRB)    ,OPTIONAL, INTENT(OUT) :: PGPG(:,:)
INTEGER(KIND=JPIM)          , INTENT(IN)  :: KFGATHG
INTEGER(KIND=JPIM)          , INTENT(IN)  :: KPROMA
INTEGER(KIND=JPIM)          , INTENT(IN)  :: KTO(:)
REAL(KIND=JPRB)             , INTENT(IN)  :: PGP(:,:,:)

! Declaration of local variables

REAL(KIND=JPRB)    :: ZFLD(D%NGPTOTMX*KFGATHG)
REAL(KIND=JPRB),ALLOCATABLE :: ZBUF(:)
INTEGER(KIND=JPIM),ALLOCATABLE :: IREQ(:)
INTEGER(KIND=JPIM) :: IFLDR,JFLD,ITAG,ILEN,JA,JB,ISND,JGL,JLON,ILOFF
INTEGER(KIND=JPIM) :: IRCV,IOFF,ILAST,IGL1,IGL2,IGLOFF,IR
INTEGER(KIND=JPIM) :: JKGLO,JROF,IEND,J,IBL,IPROC,JROC,IMYFIELDS,ILRECV
INTEGER(KIND=JPIM) :: ISENDREQ(KFGATHG),ITO
INTEGER(KIND=JPIM) :: ILENS(NPROC),IOFFS(NPROC),ILENR(NPROC),IOFFR(NPROC)
INTEGER(KIND=JPIM) :: IFLDL,IFLDS
LOGICAL :: LLSAME
!     ------------------------------------------------------------------


!GATHER SPECTRAL ARRAY

IF( NPROC == 1 ) THEN
  CALL GSTATS(1643,0)
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF)
  DO JKGLO=1,D%NGPTOT,KPROMA
    IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1)
    IOFF = JKGLO-1
    IBL  = (JKGLO-1)/KPROMA+1
    DO JFLD=1,KFGATHG
      DO JROF=1,IEND
        PGPG(IOFF+JROF,JFLD) = PGP(JROF,JFLD,IBL)
      ENDDO
    ENDDO
  ENDDO
!$OMP END PARALLEL DO
  CALL GSTATS(1643,1)

ELSE
! test if values in KTO are all the same
  LLSAME=.TRUE.
  ITO=KTO(1)
  DO JFLD=2,KFGATHG
    IF(KTO(JFLD) /= ITO) THEN
      LLSAME=.FALSE.
      EXIT
    ENDIF
  ENDDO
! IF( MYPROC==1 )THEN
!   WRITE(0,'("GATH_GRID_CTL DEBUG: LLSAME=",L1)')LLSAME
!   DO JFLD=1,KFGATHG
!     WRITE(0,'("GATH_GRID_CTL DEBUG:KFGATHG,JFLD,KTO=",3(2X,I6))')KFGATHG,JFLD,KTO(JFLD)
!   ENDDO
! ENDIF

  IFLDL=D%NGPTOTMX
  IF(LLSAME) THEN
    CALL GSTATS(1643,0)
    !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JKGLO,IEND,IOFF,IBL,JFLD,JROF)
    DO JFLD=1,KFGATHG
      DO JKGLO=1,D%NGPTOT,KPROMA
        IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1)
        IOFF = JKGLO-1
        IBL  = (JKGLO-1)/KPROMA+1
        DO JROF=1,IEND
          ZFLD(IOFF+JROF+(JFLD-1)*IFLDL) = PGP(JROF,JFLD,IBL)
        ENDDO
      ENDDO
    ENDDO
    !$OMP END PARALLEL DO
    CALL GSTATS(1643,1)
  ELSE
    ILENS(:)=0
    IOFFS(:)=0
    ILENR(:)=0
    IOFFR(:)=0
    DO JFLD=1,KFGATHG
      ILENS(KTO(JFLD))=ILENS(KTO(JFLD))+IFLDL
      IF(KTO(JFLD) == MYPROC) THEN
        ILENR(:)=ILENR(:)+IFLDL
      ENDIF
    ENDDO
    DO JROC=2,NPROC
      IOFFR(JROC)=IOFFR(JROC-1)+ ILENR(JROC-1)
      IOFFS(JROC)=IOFFS(JROC-1)+ ILENS(JROC-1)
    ENDDO
    IFLDS=0
    DO JROC=1,NPROC
      IF(ILENS(JROC) > 0) THEN
        DO JFLD=1,KFGATHG
          IF(KTO(JFLD) == JROC) THEN
            DO JKGLO=1,D%NGPTOT,KPROMA
              IEND = MIN(KPROMA,D%NGPTOT-JKGLO+1)
              IOFF = JKGLO-1
              IBL  = (JKGLO-1)/KPROMA+1
              DO JROF=1,IEND
                ZFLD(IOFF+JROF+IFLDS*IFLDL) = PGP(JROF,JFLD,IBL)
              ENDDO
            ENDDO
            IFLDS=IFLDS+1
          ENDIF
        ENDDO
      ENDIF
    ENDDO
  ENDIF
          
  IMYFIELDS = 0
  DO JFLD=1,KFGATHG
    IF(KTO(JFLD) == MYPROC) THEN
      IMYFIELDS = IMYFIELDS+1
    ENDIF
  ENDDO

  IF(IMYFIELDS > 0) THEN
    ALLOCATE(ZBUF(D%NGPTOTMX*IMYFIELDS*NPROC))
  ELSE
    ALLOCATE(ZBUF(1))
  ENDIF
  IFLDR = 0
  CALL GSTATS_BARRIER(789)
  CALL GSTATS(809,0)

  IF( LLSAME )THEN
    !Send
    ISND  = KTO(1)
    ITAG  = MTAGDISTSP+1+17
    CALL MPL_SEND(ZFLD,KDEST=NPRCIDS(ISND),KTAG=ITAG,&
     &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=ISENDREQ(1),&
     &CDSTRING='GATH_GRID_CTL:')
    ! RECIEVE
    IF(KTO(1) == MYPROC) THEN
      IFLDR = KFGATHG
      DO JROC=1,NPROC
        ITAG  = MTAGDISTSP+1+17
        IRCV  = JROC
        IOFF=IFLDL*KFGATHG*(JROC-1)
        CALL MPL_RECV(ZBUF(IOFF+1:IOFF+IFLDL*KFGATHG),KSOURCE=NPRCIDS(IRCV),&
         &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILRECV,&
         &KTAG=ITAG,CDSTRING='GATH_GRID_CTL:')
      ENDDO
    ENDIF
    CALL MPL_WAIT(KREQUEST=ISENDREQ(1), &
     & CDSTRING='GATH_GRID_CTL: WAIT')
  ELSE
    IFLDR=IMYFIELDS

!   ALLTOALLV performance is really slow when number of fields (KFGATHG) is << NPROC
!   This was for IBM - and RECV/SEND alternative causes problems for large number of MPI tasks.

!   IF( KFGATHG >= NPROC/8 )THEN
    IF( .TRUE. )THEN
      CALL MPL_ALLTOALLV(PSENDBUF=ZFLD,KSENDCOUNTS=ILENS,&
       & PRECVBUF=ZBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,&
       & CDSTRING='GATH_GRID_CTL:')
    ELSE
      IR=0
      DO JFLD=1,KFGATHG
        IF(KTO(JFLD) == MYPROC) THEN
          IR=IR+NPROC
        ENDIF
      ENDDO
      IR=IR+KFGATHG
      ALLOCATE(IREQ(IR))
      IR=0
      ITAG  = MTAGDISTSP+1+17
      DO JROC=1,NPROC
        DO JFLD=1,KFGATHG
          IF(KTO(JFLD) == MYPROC) THEN
            IRCV  = JROC
            IR=IR+1
            CALL MPL_RECV(ZBUF(1+IOFFR(IRCV):IOFFR(IRCV)+ILENR(IRCV)),KSOURCE=NPRCIDS(IRCV),KTAG=ITAG,&
             &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR),&
             &CDSTRING='GATH_GRID_CTL:')
          ENDIF
        ENDDO
      ENDDO
      DO JFLD=1,KFGATHG
        ISND  = KTO(JFLD)
        IR=IR+1
        CALL MPL_SEND(ZFLD(1+IOFFS(ISND):IOFFS(ISND)+ILENS(ISND)),KDEST=NPRCIDS(ISND),KTAG=ITAG,&
         &KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR),&
         &CDSTRING='GATH_GRID_CTL:')
      ENDDO
      CALL MPL_WAIT(KREQUEST=IREQ(1:IR), &
       & CDSTRING='GATH_GRID_CTL: WAIT')
      DEALLOCATE(IREQ)
    ENDIF
  ENDIF
  
  CALL GSTATS(809,1)
  CALL GSTATS_BARRIER2(789) 
  CALL GSTATS(1643,0)
!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1)&
!$OMP&PRIVATE(JA,JB,IPROC,IGLOFF,IGL1,IGL2,IOFF,ILAST,J,&
!$OMP&ILEN,ILOFF,JGL,JLON,JFLD)
  DO JFLD=1,IFLDR
    DO JA=1,N_REGIONS_NS
      DO JB=1,N_REGIONS(JA)
        CALL SET2PE(IPROC,JA,JB,0,0)
        IGLOFF = D%NPTRFRSTLAT(JA)
        IGL1 = D%NFRSTLAT(JA)
        IGL2 = D%NLSTLAT(JA)
        IOFF = 0
        IF(JA > 1) THEN
          IF( D%NLSTLAT(JA-1) == D%NFRSTLAT(JA) )THEN
            ILAST = D%NLSTLAT(JA-1)-1
          ELSE
            ILAST = D%NLSTLAT(JA-1)
          ENDIF
          DO J=D%NFRSTLAT(1),ILAST
            IOFF = IOFF+G%NLOEN(J)
          ENDDO
        ENDIF

        ILEN = 0
        ILOFF = 0
        DO JGL=IGL1,IGL2
          DO JLON=1,D%NONL(IGLOFF+JGL-IGL1,JB)
            PGPG(IOFF+ILOFF+D%NSTA(IGLOFF+JGL-IGL1,JB)+JLON-1,JFLD) = &
             & ZBUF(ILEN+JLON+(JFLD-1)*IFLDL+(IPROC-1)*IFLDL*IMYFIELDS)
          ENDDO
          ILEN = ILEN + D%NONL(IGLOFF+JGL-IGL1,JB)
          ILOFF = ILOFF + G%NLOEN(JGL)
        ENDDO
      ENDDO
    ENDDO
  ENDDO
!$OMP END PARALLEL DO

  CALL GSTATS(1643,1)
! Synhronize processors
! Should not be necessary
!!$  CALL GSTATS(784,0)
!!$  CALL MPL_BARRIER(CDSTRING='GATH_GRID_CTL:')
!!$  CALL GSTATS(784,1)
  IF(ALLOCATED(ZBUF)) DEALLOCATE(ZBUF)
ENDIF

!     ------------------------------------------------------------------

END SUBROUTINE GATH_GRID_CTL
END MODULE GATH_GRID_CTL_MOD


